Tidycensus


The Census Bureau only released APIs for the following time frames:

get_acs()

  • American Community Survey 1-Year Data (2011-2016)
  • American Community Survey 3 Year Data (2012-2013)
  • American Community Survey 5-Year Data (2009-2016)

get_decennial()

  • Decennial US Census (1990, 2000, 2010)

Only specific geographies are available for this method:

  • "us"'
  • "region"
  • "state"
  • "county"
  • "public use microdata area"
  • "tract"

library(tidyverse)
library(tidycensus)

# devtools::install_github("austensen/acssf")
library(acssf) # # helper functions: acs_vars(), acs_sum()
out_dir <- " " # insert path to folder in which you want to store final files

census_api_key("YOUR API KEY GOES HERE")

Selecting Variables

- Finding variables

# Pull variables
acs_variables <- load_variables(2016, "acs5")

head(acs_variables)
name label concept
B00001_001 Estimate!!Total UNWEIGHTED SAMPLE COUNT OF THE POPULATION
B00002_001 Estimate!!Total UNWEIGHTED SAMPLE HOUSING UNITS
B01001_001 Estimate!!Total SEX BY AGE
B01001_002 Estimate!!Total!!Male SEX BY AGE
B01001_003 Estimate!!Total!!Male!!Under 5 years SEX BY AGE
B01001_004 Estimate!!Total!!Male!!5 to 9 years SEX BY AGE
# In R Studio, use view and filter to search for a variable
View(acs_variables)
# Search columns in a known table
acs_variables %>%
  filter(str_detect(name,"B19001")) %>%
  head()
name label concept
B19001_001 Estimate!!Total HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
B19001_002 Estimate!!Total!!Less than $10,000 HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
B19001_003 Estimate!!Total!!$10,000 to $14,999 HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
B19001_004 Estimate!!Total!!$15,000 to $19,999 HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
B19001_005 Estimate!!Total!!$20,000 to $24,999 HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
B19001_006 Estimate!!Total!!$25,000 to $29,999 HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS)
# Search for a know variable using a keyword
acs_variables %>%
  filter(str_detect(label,fixed("race", ignore_case = TRUE))) %>%
  head()
name label concept
B02001_007 Estimate!!Total!!Some other race alone RACE
B02001_008 Estimate!!Total!!Two or more races RACE
B02001_009 Estimate!!Total!!Two or more races!!Two races including Some other race RACE
B02001_010 Estimate!!Total!!Two or more races!!Two races excluding Some other race, and three or more races RACE
B03002_008 Estimate!!Total!!Not Hispanic or Latino!!Some other race alone HISPANIC OR LATINO ORIGIN BY RACE
B03002_009 Estimate!!Total!!Not Hispanic or Latino!!Two or more races HISPANIC OR LATINO ORIGIN BY RACE

- Selecting variables

vars1 <- acs_vars("B17010B_{c(4, 11, 17, 24, 31, 37)*}")

vars2 <- acs_vars("B01001_{c(3:6, 27:30)*}")

Getting Data

- Source

# Decennial 
get_decennial(geography = "state", 
              variables = c(Population = "P001001"),
              year = 2010,
              output = "wide") %>%
  head(5)
## Getting data from the 2010 decennial Census
GEOID NAME Population
01 Alabama 4779736
02 Alaska 710231
04 Arizona 6392017
05 Arkansas 2915918
06 California 37253956
# American Community Survey
get_acs(geography = "tract",
        state = "NY", 
        variables = c(Female_Newborn = "B01001_027"),
        year = 2016,
        survey = "acs5",  # acs5 = five-year estimates, acs1= one-year estimates
        output = "wide") %>%
  rename(Female_Newborn_Estimate = Female_NewbornE,
         Female_Newborn_MOE = Female_NewbornM) %>%
  head(5)
## Getting data from the 2012-2016 5-year ACS
GEOID NAME Female_Newborn_Estimate Female_Newborn_MOE
36001000100 Census Tract 1, Albany County, New York 87 54
36001000200 Census Tract 2, Albany County, New York 170 86
36001000300 Census Tract 3, Albany County, New York 241 104
36001000401 Census Tract 4.01, Albany County, New York 71 44
36001000403 Census Tract 4.03, Albany County, New York 170 94

- Whole Table

# Getting the whole table 
get_acs(geography = "county",
        state = "NY",
        table = "B19001",
        year = 2016,
        survey = "acs5",
        output = "wide") %>%
  head(3)
## Getting data from the 2012-2016 5-year ACS
## Loading ACS5 variables for 2016 from table B19001. To cache this dataset for faster access to ACS tables in the future, run this function with `cache_table = TRUE`. You only need to do this once per ACS dataset.
GEOID NAME B19001_001E B19001_001M B19001_002E B19001_002M B19001_003E B19001_003M B19001_004E B19001_004M B19001_005E B19001_005M B19001_006E B19001_006M B19001_007E B19001_007M B19001_008E B19001_008M B19001_009E B19001_009M B19001_010E B19001_010M B19001_011E B19001_011M B19001_012E B19001_012M B19001_013E B19001_013M B19001_014E B19001_014M B19001_015E B19001_015M B19001_016E B19001_016M B19001_017E B19001_017M
36001 Albany County, New York 124108 805 7606 600 6285 524 5578 517 5929 639 5026 419 5819 566 4858 460 5379 444 4663 527 9980 629 12530 723 16017 756 11693 641 7773 467 8093 576 6879 490
36003 Allegany County, New York 18032 295 1282 173 1193 156 1194 157 1114 135 988 120 1122 119 1113 129 1209 152 874 112 1605 155 1916 164 2277 196 1103 114 501 84 320 60 221 48
36005 Bronx County, New York 490740 1515 76719 1775 45398 1629 36856 1357 30086 1030 27848 995 26839 1108 24339 1193 22364 1118 18635 1096 34602 1184 39721 1432 44111 1429 26552 1007 14395 880 12865 638 9410 591

- Iterating

# Over multiple states
get_acs(geography = "tract",
        state = c("NY", "CT", "NJ", "PA", "MA", "RI"),
        variables = vars2,
        year = 2016,
        survey = "acs5",
        output = "wide") %>%
  head(3)
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
GEOID NAME B01001_003E B01001_003M B01001_004E B01001_004M B01001_005E B01001_005M B01001_006E B01001_006M B01001_027E B01001_027M B01001_028E B01001_028M B01001_029E B01001_029M B01001_030E B01001_030M
36001000100 Census Tract 1, Albany County, New York 90 58 46 45 52 41 58 38 87 54 76 53 97 63 42 38
36001000200 Census Tract 2, Albany County, New York 159 113 100 71 78 74 52 48 170 86 127 108 142 106 0 11
36001000300 Census Tract 3, Albany County, New York 165 101 192 110 177 104 149 132 241 104 105 145 171 143 47 63
# Over multiple years 
map_dfr(2012:2016, function(x) {
  get_acs(geography = "state",
          state = "NY",
          variables = "B03002_001",
          year = x,
          survey = "acs5") %>% 
    mutate(year = x)})
## Getting data from the 2008-2012 5-year ACS
## Getting data from the 2009-2013 5-year ACS
## Getting data from the 2010-2014 5-year ACS
## Getting data from the 2011-2015 5-year ACS
## Getting data from the 2012-2016 5-year ACS
GEOID NAME variable estimate moe year
36 New York B03002_001 19398125 NA 2012
36 New York B03002_001 19487053 NA 2013
36 New York B03002_001 19594330 NA 2014
36 New York B03002_001 19673174 NA 2015
36 New York B03002_001 19697457 NA 2016

Wrangling Data

- Aggregating

get_acs(geography = "tract",
        state = "NY", 
        variables = vars2,
        year = 2016,
        survey = "acs5",
        output = "wide") %>%
  mutate(pop_u18_total = acs_sum("B01001_{c(3:6, 27:30)*}E"),
         pop_u18_male = acs_sum("B01001_{c(3:6)*}E"),
         pop_u18_female = acs_sum("B01001_{c(27:30)*}E")) %>%
  select(GEOID, pop_u18_total, pop_u18_male, pop_u18_female) %>%
  head(3)
## Getting data from the 2012-2016 5-year ACS
GEOID pop_u18_total pop_u18_male pop_u18_female
36001000100 548 246 302
36001000200 828 389 439
36001000300 1247 683 564

- Normalizing

race_vars <- c(White = "B03002_003", Black = "B03002_004", Native = "B03002_005", 
               Asian = "B03002_006", HIPI = "B03002_007", Hispanic = "B03002_012")

total_pop <- "B03002_001"

ny_acs_income_data <- get_acs(geography = "county",
                              state = "NY",
                              variables = race_vars,
                              summary_var = total_pop,
                              year = 2016,
                              survey = "acs5")

ny_acs_income_data %>%
  mutate(pct = 100 * (estimate / summary_est)) %>% 
  head(3)
GEOID NAME variable estimate moe summary_est summary_moe pct
36001 Albany County, New York White 226677 292 307891 NA 73.622483
36001 Albany County, New York Black 36350 726 307891 NA 11.806126
36001 Albany County, New York Native 329 94 307891 NA 0.106856

- Largest variable

ny_acs_income_data %>%
  group_by(GEOID) %>%
  filter(estimate == max(estimate)) %>% 
  head()
GEOID NAME variable estimate moe summary_est summary_moe
36001 Albany County, New York White 226677 292 307891 NA
36003 Allegany County, New York White 45116 40 47700 NA
36005 Bronx County, New York Hispanic 796193 NA 1436785 NA
36007 Broome County, New York White 166815 90 197381 NA
36009 Cattaraugus County, New York White 71489 41 78506 NA
36011 Cayuga County, New York White 71408 85 78783 NA
ny_acs_income_data %>%
  group_by(GEOID) %>%
  filter(estimate == max(estimate)) %>% 
  group_by(variable) %>%
  tally()
variable n
Hispanic 2
White 60

- Case counts

ny_acs_income_data %>%
  filter(variable != "B19001_001") %>%  #"B19001_001" is the total count
  mutate(incgroup = case_when(
    variable < "B19001_008" ~ "below35k", 
    variable < "B19001_013" ~ "35kto75k", 
    TRUE ~ "above75k")) %>%
  group_by(NAME, incgroup) %>%
  summarize(group_est = sum(estimate)) %>%
  head(3)
NAME incgroup group_est
Albany County, New York above75k 280590
Albany County, New York below35k 18567
Allegany County, New York above75k 46659

ACS Margins of Error

Whereas the Decennial census is a literal census (or as close to one as possible), the ACS is based on a sampling. As such, the numbers are really estimates which also have margins of error because they are not literal counts.

- Scale of margin of error

ny_elder_poverty <- get_acs(geography = "tract",
                            state = "NY",
                            variables = c(male = "B17001_016", female = "B17001_030"),
                            year = 2016,
                            survey = "acs5")

moe_check <- ny_elder_poverty %>%
  filter(moe > estimate) 

nrow(moe_check) / nrow(ny_elder_poverty)
## [1] 0.8294022

- Preserving MOE when using functions

# Adding variables
moe_sum(moe = c(55, 33, 44, 12, 4),
        estimate = c(1000, 125, 300, 54, 12))
## [1] 78.80355
# Multiplying variables
moe_product(est1 = 55,
            est2 = 33,
            moe1 = 12,
            moe2 = 9)
## [1] 633.9093
# Finding the ration between variables
moe_ratio(num = 1000,
          denom = 950,
          moe_num = 200,
          moe_denom = 177)
## [1] 0.287724
# Findthing the proportion of one variable by another 
moe_prop(num = 374,
         denom = 1200,
         moe_num = 122,
         moe_denom = 333)
## [1] 0.05344178
# Example 
ny_elder_poverty <- ny_elder_poverty %>%
  group_by(GEOID) %>%
  summarize(estmf = sum(estimate),
            moemf = moe_sum(moe = moe, estimate = estimate))

moe_check <- ny_elder_poverty %>%
  filter(moemf > estmf)

nrow(moe_check) / nrow(ny_elder_poverty)
## [1] 0.6520943

Plotting

ny_county_income <- get_acs(geography = "county",
                            state = "NY",
                            variables = c(hhincome = "B19013_001"), 
                            year = 2016,
                            survey = "acs5") %>%
  mutate(NAME = str_replace(NAME, " County, New York", "")) %>%
  filter(NAME %in% c("Kings", "Queens", "New York", "Bronx", "Richmond", "Westchester", "Nassau", "Albany"))

ggplot(ny_county_income, aes(x = estimate, y = reorder(NAME, estimate))) + 
  geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) + 
  labs(title = "Median Household Income by County", 
       x = "ACS estimate (bars represent margins of error)", 
       y = "") + 
  scale_x_continuous(labels = scales::dollar)


Saving Data

write_csv(dataset, str_glue("{out_dir}/dataset.csv"))

Tigris


library(tigris)
options(tigris_class = "sf")

Getting Shapefiles

- Caching data

tigris_cache_dir("Your preferred cache directory path would go here")
options(tigris_use_cache = TRUE)

- Basic fields

# Counties
ny_counties <- counties(state = "NY")
plot(ny_counties$geometry)

# Clipped Boundaries
ny_counties_clipped <- counties(state = "NY", cb = TRUE)
plot(ny_counties_clipped$geometry)

# Census Tracts
manhattan_tracts <- tracts(state = "NY", county = "New York", cb = TRUE)
plot(manhattan_tracts$geometry)

# Water Features 
ithica_water <- area_water(state = "NY", county = "Tompkins")
plot(ithica_water$geometry)

# Roads
ny_roads <- primary_secondary_roads(state = "NY")
plot(ny_roads$geometry)

# Census Tract Year
orange_tracts_90 <- tracts(state = "NY", county = "Orange", cb = TRUE, year = 1990)
orange_tracts_16 <- tracts(state = "NY", county = "Orange", cb = TRUE, year = 2016)
plot(orange_tracts_90$geometry)

plot(orange_tracts_16$geometry)


Plotting

- Basic

plot(shapefile)

- Multiplfe files on same plot

mi_tiger <- counties("MI")
mi_cb <- counties("MI", cb = TRUE)
plot(mi_tiger$geometry)
plot(mi_cb$geometry, add = TRUE, border = "red")


Joining Data*

- Multiple shapefiles

ny_tracts <- tracts(state="NY", cb = TRUE)
nj_tracts <- tracts(state="NJ", cb = TRUE)

ny_nj_tracts <- rbind_tigris(ny_tracts, nj_tracts)
new_england <- c("ME", "NH", "VT", "MA")

ne_tracts <- map(new_england, function(x) {
  tracts(state = x, cb = TRUE)}) %>%
  rbind_tigris()

- Shapefiles and Census data

ny_counties_pop = get_acs(geography = "county",
                       state = "NY",
                       variables = "B03002_001",
                       year = 2016,
                       survey = "acs5",
                       output = "wide")

ny_counties_shp <- counties(state = "NY", cb = TRUE)

ny_counties <- left_join(ny_counties_shp, ny_counties_pop, by = "GEOID")

- Geography from Census pull

get_acs(geography = "tract",
        state = "NY",
        county = "New York", 
        variables = "B25077_001", 
        geometry = TRUE)

Visualization

ggplot(ny_counties, aes(fill = B03002_001E)) +
  geom_sf() +
  scale_fill_viridis_c(name = "Population")

ggplot(ny_counties, aes(fill = B03002_001E, color = B03002_001E)) +
  geom_sf() +
  scale_fill_viridis_c(name = "Population")+
  scale_color_viridis_c(guide = FALSE) +
  theme_minimal() + 
  coord_sf(datum = NA) + 
  labs(title = "Population by County", 
       caption = "Data source: 2012-2016 ACS.\nData acquired with the R tidycensus package.")

race_vars <- c(White = "B03002_003", Black = "B03002_004", Asian = "B03002_006", Hispanic = "B03002_012")

total_pop <- "B03002_001"

nyc_race_data <- get_acs(geography = "tract",
                         state = "NY",
                         county = c("Bronx", "New York", "Queens", "Kings", "Richmond"),
                         variables = race_vars,
                         summary_var = total_pop,
                         year = 2016,
                         survey = "acs5") %>%
  mutate(pct = 100 * (estimate / summary_est))

nyc_tracts <- tracts(state = "NY", county = c("Bronx", "New York", "Queens", "Kings", "Richmond"), cb = TRUE)

nyc_tracts_race <- left_join(nyc_tracts, nyc_race_data, by = "GEOID")

ggplot(nyc_tracts_race, aes(fill = pct, color = pct)) + 
  geom_sf() + 
  scale_fill_viridis_c(name = "Percent of Population", na.value = "white")+
  scale_color_viridis_c(guide = FALSE, na.value = "white") +
  theme_minimal() + 
  coord_sf(datum = NA) + 
  facet_wrap(~variable)

library(mapview)

data_map <- ny_counties %>%
  select(NAME.x, B03002_001E, geometry) %>%
  rename(Population = B03002_001E,
         County = NAME.x)

m <- mapview(data_map, 
         zcol = "Population", 
         legend = TRUE)
m@map
library(sf)

nyc_dots <- map(c("White", "Black", "Hispanic", "Asian"), function(group) {
  nyc_tracts_race %>%
    filter(variable == group) %>%
    st_sample(., size = .$estimate / 100) %>%
    st_sf() %>%
    mutate(group = group) 
}) %>%
  reduce(rbind) %>%
  group_by(group) %>%
  summarize()
nyc_boundary <- counties("NY", cb = TRUE) %>%
  filter(NAME %in% c("Bronx", "New York", "Queens", "Kings", "Richmond"))
ggplot() + 
  geom_sf(data = nyc_boundary, color = "black", fill = "white") + 
  geom_sf(data = nyc_dots, aes(color = group, fill = group), size = 0.1) + 
  coord_sf(datum = NA) + 
  scale_color_brewer(palette = "Set1", guide = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "The Racial Geography of NYC", 
       subtitle = "2011-2016 ACS", 
       fill = "", 
       caption = "1 dot = approximately 1,000 people.\nData acquired with the R tidycensus and tigris packages.")